source("../rscripts/package.R")
Each GF binds to a receptor and triggers a cascade that includes ERK plus others. But it is important for the cell to respond adequately to the signal, e.g. proliferation vs differentiation. Each GF induces a different type of response when applied in a sustained fashion:
Clustering gives us ways to see these different behaviours, but we would like a way to put numbers on these differences in behaviour, maybe eventually to automatically detect them.
Data look like:
# Remove unnecessary columns for clarity
head(Yanni)
## Condition Label Ratio RealTime Metadata_Series Metadata_T
## 1: E-0.25 12_0002 1183.396 0 12 0
## 2: E-0.25 12_0002 1187.431 2 12 1
## 3: E-0.25 12_0002 1172.491 4 12 2
## 4: E-0.25 12_0002 1176.407 6 12 3
## 5: E-0.25 12_0002 1172.421 8 12 4
## 6: E-0.25 12_0002 1167.917 10 12 5
## TrackObjects_Label Stim_All_Ch Stim_All_S
## 1: 2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 2: 2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 3: 2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 4: 2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 5: 2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 6: 2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
del.cols <- names(Yanni)[5:9]
Yanni[, (del.cols) := NULL]
Yanni
## Condition Label Ratio RealTime
## 1: E-0.25 12_0002 1183.396 0
## 2: E-0.25 12_0002 1187.431 2
## 3: E-0.25 12_0002 1172.491 4
## 4: E-0.25 12_0002 1176.407 6
## 5: E-0.25 12_0002 1172.421 8
## ---
## 195640: N-250 14_0051 1278.354 352
## 195641: N-250 14_0051 1276.160 354
## 195642: N-250 14_0051 1282.459 356
## 195643: N-250 14_0051 1280.433 358
## 195644: N-250 14_0051 1282.518 360
ggplot(Yanni, aes(y=Ratio, x=RealTime)) + geom_line(aes(group=Label)) + facet_wrap(~Condition) + scale_y_continuous(limits = c(1000, 1750)) + scale_x_continuous(limits = c(0,200)) + stat_summary(fun.y=mean, geom="line", colour = "blue", size = 1.5) + theme(text = element_text(size = 25))
Idea: Hclust, Kmeans… Use the clustering quality criteria (Davies-Bouldin, Dunn…) as a measure of distance between the clusters.
Here Kmeans/Kmedoids seem appropriate, we guess that the number of clusters should be no more than let’s say 10.
Prior to clustering, normalize data since the computations of distances between TS always rely on some kind of Euclidean distance. We use the fold-change “Korean way”, which normalizes in a per trajectory fashion.
# Normalization
# Per-trajectory fold-change: Ratio is normalized against a baseline computed on
# RealTime in [0, 36] for each (Condition, Label) pair. myNorm is a project
# helper from package.R -- presumably divides by the baseline mean; TODO confirm.
Yanni <- myNorm(in.dt = Yanni, in.meas.col = "Ratio", in.rt.min = 0, in.rt.max = 36, in.by.cols = c("Condition", "Label"), in.type = "fold.change")
# X trimming
# Keep only the first 200 minutes of each trajectory
Yanni <- Yanni[RealTime <= 200]
Erase small variations in the signal that are not relevant for distance computation, smooth out high frequency variations (noise).
# Smooth each trajectory over a window of k = 5 time points to damp high-frequency
# noise before distance computation (rollex is a project helper -- presumably a
# rolling average; TODO confirm).
Yanni[, Ratio.norm.smooth := rollex(Ratio.norm, k = 5), by = c("Condition", "Label")]
Yanni
## Condition Label Ratio RealTime Ratio.norm Ratio.norm.smooth
## 1: E-0.25 12_0002 1183.396 0 1.0072021 1.0056096
## 2: E-0.25 12_0002 1187.431 2 1.0106363 1.0042921
## 3: E-0.25 12_0002 1172.491 4 0.9979203 1.0029746
## 4: E-0.25 12_0002 1176.407 6 1.0012535 1.0003396
## 5: E-0.25 12_0002 1172.421 8 0.9978608 0.9995891
## ---
## 117487: N-250 14_0051 1381.819 192 1.1640162 1.1499901
## 117488: N-250 14_0051 1366.671 194 1.1512561 1.1506136
## 117489: N-250 14_0051 1323.189 196 1.1146276 1.1313087
## 117490: N-250 14_0051 1353.097 198 1.1398211 1.1216562
## 117491: N-250 14_0051 1290.181 200 1.0868222 1.1120037
# Plot first trajectory: raw vs normalized vs normalized+smoothed, stacked vertically
par(mfrow=c(3,1))
plot(Yanni[Condition=="E-0.25" & Label=="12_0002", Ratio], type = "b", ylab = "value", cex.main=3, cex.axis=2.5, main = "Raw")
plot(Yanni[Condition=="E-0.25" & Label=="12_0002", Ratio.norm], type = "b", ylab = "value", cex.main=3, cex.axis=2.5, main = "Normalized")
plot(Yanni[Condition=="E-0.25" & Label=="12_0002", Ratio.norm.smooth], type = "b", ylab = "value", cex.main=3, cex.axis=2.5, main = "Normalized and smoothed")
Will use a method based on DTW, much more tailored to TS than classical Euclidean distance. In addition we use partitioning around medoids instead of Kmeans; these are basically the same methods except that medoids are much less sensitive to outliers. Instead of picking the mean of a cluster as a center, medoids pick the data point that is the closest to the middle of the cluster.
Though the method is dependent on the initialization, changing the seed doesn’t affect the clustering that much.
CastCluster <- function(data, time.col, condition.col, label.col, measure.col, k.clust, na.fill, plot = T, return.quality = T, ...){
# Cast to wide, cluster and get quality indices
# data: long data.table with one measurement per (condition, label, time) row
# k.clust: vector of cluster counts to try; one clustering is fitted per value
# na.fill: constant used to plug missing time points before clustering
# ...: forwarded to dtwclust::tsclust
# Returns list(cluster = fitted tsclust objects, table = casted table with one
# cluster-assignment column per k, quality = matrix of internal validity
# indices); quality is omitted when return.quality = FALSE.
require(dtwclust)
temp <- myCast(data, time.col, condition.col, label.col, measure.col, na.fill)
# Make clustering, and get quality indexes
# Partitional clustering with PAM centroids on DTW distances; fixed seed for
# reproducibility. With a vector k, tsclust returns one fitted object per k.
clust <- tsclust(temp$casted.matrix, type = "partitional", k = k.clust, distance = "dtw_basic", centroid = "pam", seed = 42, ...)
names(clust) <- paste0("k_", k.clust)
# Internal cluster validity indices (see ?dtwclust::cvi), one column per k
quality <- sapply(clust, cvi, type = "internal")
# Add a column with the clusters to the casted table
# Each iteration prepends the assignment vector, so the final column order is
# reversed relative to k.clust (largest k ends up leftmost).
cluster.table <- temp$casted
for(k in 1:length(clust)){
cluster.table <- cbind(clust[[k]]@cluster, cluster.table)
colnames(cluster.table)[1] <- names(clust)[k]
}
# Plot
# Bar plot of each quality index against the number of clusters
if(plot){
require(ggplot2)
mquality <- melt(quality)
names(mquality) <- c("Stat", "Nb.Clust", "value")
plot(ggplot(mquality, aes(x=Nb.Clust, y = value)) + geom_col(aes(group = Nb.Clust, fill = Nb.Clust)) + facet_grid(Stat ~ ., scales = "free_y"))
}
# Output
if(return.quality) return(list(cluster = clust, table = cluster.table, quality = quality))
else return(list(out = clust, table = cluster.table))
}
myCast <- function(data, time.col, condition.col, label.col, measure.col, na.fill){
  # Cast a long table to a wide matrix: one row per trajectory, one column per
  # time point. dcast can reorder rows depending on the order in which the keyed
  # columns are passed, so the casted table is returned alongside the matrix to
  # keep the row <-> trajectory mapping.
  # NOTE: because dcast is fed get() expressions, the two leading ID columns of
  # `casted` are literally named "condition.col" and "label.col".
  wide <- dcast(copy(data),
                get(condition.col) + get(label.col) ~ get(time.col),
                value.var = measure.col)
  wide.mat <- as.matrix(wide[, c(-1, -2)]) # drop the 2 leading ID columns
  wide.mat[is.na(wide.mat)] <- na.fill     # plug missing time points with a constant
  return(list(casted = wide, casted.matrix = wide.mat))
}
plot_cluster <- function(data, id.vars.col, cluster.col, type){
# Plot clusters directly from output$table of CastCluster
# id.vars.col: given in indices (include ALL clustering columns)
# cluster.col: name of the column with clustering to plot
# type: "trajectory" (one line per trajectory, faceted by cluster, with a blue
# mean curve) or "composition" (bar plot of condition counts per cluster).
# NOTE: `label.col` and `condition.col` below are NOT variables of this function;
# they are the literal column names produced by myCast's dcast call (which casts
# on get() expressions), so ggplot's NSE resolves them as columns of `melted`.
library(ggplot2)
ids <- colnames(data)[id.vars.col]
# All non-id columns are time points; melt them into (variable, value) pairs
melted <- melt(data, id.vars = ids)
if(type=="trajectory"){
ggplot(melted, aes(x = as.numeric(variable), y = value)) + geom_line(aes(group = label.col)) +
facet_wrap(as.formula(paste("~",cluster.col))) + stat_summary(fun.y=mean, geom="line", colour = "blue", size = 1.5) + xlab("Time")
} else if(type=="composition"){
# Make the cluster id discrete so bars are grouped rather than on a continuous scale
melted[, c(cluster.col):=as.factor(get(cluster.col))]
ggplot(melted, aes_string(cluster.col)) + geom_bar(aes(fill=condition.col))
}
}
# Split the data by growth factor (E = EGF, N = NGF, F = FGF; the number is the
# concentration in ng/ml)
EGF <- Yanni[Condition %in% c("E-0.25", "E-2.5", "E-25", "E-250")]
NGF <- Yanni[Condition %in% c("N-0.25", "N-2.5", "N-25", "N-250")]
FGF <- Yanni[Condition %in% c("F-0.25", "F-2.5", "F-25", "F-250")]
# Column names shared by all CastCluster calls
cond <- "Condition"
lab <- "Label"
tim <- "RealTime"
# Numbers of clusters to try
k.clus <- 2L:8L
# NA fill constants: raw ratios sit around 1200, normalized ratios around 1.1
na.f.raw <- 1200
na.f.nor <- 1.1
Clustering quality is evaluated with the following indices (see ?dtwclust::cvi ; https://pdfs.semanticscholar.org/a522/fb4646ad19fe88893b90e6fbc1faa1470976.pdf):
Results with these indices are very often unclear and contradictory; inspecting the cluster composition can help. In a setting where one concentration gives rise to a unique response, we would expect a good clustering to contain clusters with at least 25% of the observations (since there are 4 concentrations) that are enriched/depleted in at least one concentration.
# With unormalized and unsmoothed data
clust_EGF_1 <- CastCluster(EGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.raw, plot = F, measure.col = "Ratio")
# With normalized and unsmoothed data
clust_EGF_2 <- CastCluster(EGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm")
# With normalized and smoothed data
clust_EGF_3 <- CastCluster(EGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm.smooth")
Advised number of clusters:
raw <- c(apply(clust_EGF_1$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_EGF_1$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
norm <- c(apply(clust_EGF_2$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_EGF_2$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
smooth <- c(apply(clust_EGF_3$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_EGF_3$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
rbind(raw, norm, smooth)
## CH D SF Sil COP DB DBstar
## raw 3 8 2 2 8 3 3
## norm 2 5 2 2 7 2 2
## smooth 2 6 2 2 6 2 2
plot_cluster(clust_EGF_3$table, id.vars.col = 1:9, cluster.col = "k_2", type = "trajectory")
## Warning: Removed 21 rows containing non-finite values (stat_summary).
# Cluster composition with Raw data
library(gridExtra)
for(i in k.clus){
name <- paste0("k_",i)
base::assign(paste0("q",i), plot_cluster(clust_EGF_1$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)
# With normalized smoothed data
for(i in k.clus){
name <- paste0("k_",i)
base::assign(paste0("q",i), plot_cluster(clust_EGF_3$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)
Each cluster systematically contains the 4 concentrations, very low separability –> 1 type of response whatever the concentration.
clust_NGF_1 <- CastCluster(NGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.raw, plot = F, measure.col = "Ratio")
clust_NGF_2 <- CastCluster(NGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm")
clust_NGF_3 <- CastCluster(NGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm.smooth")
Advised number of clusters:
raw <- c(apply(clust_NGF_1$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_NGF_1$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
norm <- c(apply(clust_NGF_2$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_NGF_2$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
smooth <- c(apply(clust_NGF_3$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_NGF_3$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
rbind(raw, norm, smooth)
## CH D SF Sil COP DB DBstar
## raw 2 7 2 2 8 2 2
## norm 2 8 2 2 8 2 2
## smooth 2 8 2 2 8 2 2
plot_cluster(clust_NGF_3$table, id.vars.col = 1:9, cluster.col = "k_2", type = "trajectory")
## Warning: Removed 41 rows containing non-finite values (stat_summary).
# Cluster composition with Raw data
library(gridExtra)
for(i in k.clus){
name <- paste0("k_",i)
base::assign(paste0("q",i), plot_cluster(clust_NGF_1$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)
# With normalized smoothed data
for(i in k.clus){
name <- paste0("k_",i)
base::assign(paste0("q",i), plot_cluster(clust_NGF_3$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)
Each cluster systematically contains the 4 concentrations, very low separability –> 1 type of response whatever the concentration.
clust_FGF_1 <- CastCluster(FGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.raw, plot = F, measure.col = "Ratio")
clust_FGF_2 <- CastCluster(FGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm")
clust_FGF_3 <- CastCluster(FGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm.smooth")
Advised number of clusters:
raw <- c(apply(clust_FGF_1$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_FGF_1$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
norm <- c(apply(clust_FGF_2$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_FGF_2$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
smooth <- c(apply(clust_FGF_3$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_FGF_3$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
rbind(raw, norm, smooth)
## CH D SF Sil COP DB DBstar
## raw 2 8 2 2 8 2 2
## norm 2 8 2 2 8 2 2
## smooth 2 8 2 2 8 2 2
plot_cluster(clust_FGF_3$table, id.vars.col = 1:9, cluster.col = "k_2", type = "trajectory")
## Warning: Removed 11 rows containing non-finite values (stat_summary).
# Cluster composition with Raw data
library(gridExtra)
for(i in k.clus){
name <- paste0("k_",i)
base::assign(paste0("q",i), plot_cluster(clust_FGF_1$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)
# With normalized smoothed data
for(i in k.clus){
name <- paste0("k_",i)
base::assign(paste0("q",i), plot_cluster(clust_FGF_3$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)
Separability seems better than for EGF and NGF, notably concentration 0.25 and 250 tend to be taken apart.
The gap statistic is a heuristic way of estimating the number of clusters. It compares the variance explained by our clustering to the expected explained variance if there were no clusters but just noise in that specific space region. It is very computationally costly, but has the major advantage of also indicating whether clustering is even relevant in the first place (i.e. whether k=1 should be kept). See: https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/
Prototype script:
modtsclust <- function(data, k, type, centroid){
  # Thin adapter around dtwclust::tsclust with the interface cluster::clusGap
  # expects from its FUN argument: FUN(data, k) -> list(cluster = <assignments>).
  require(dtwclust)
  fit <- tsclust(data, k = k, type = type, centroid = centroid)
  list(cluster = fit@cluster)
}
kmax = 6L
require(cluster)
# Gap statistic on EGF, normalized + smoothed data (B = 50 reference data sets)
temp <- copy(EGF)
temp <- dcast(temp, Label ~ RealTime, value.var = "Ratio.norm.smooth")
temp <- as.matrix(temp[, -1]) # remove first columns with labels
# 1.1 is the NA fill for normalized data (see na.f.nor above)
temp[which(is.na(temp))] <- 1.1
cgap_EGF_smooth <- clusGap(temp, FUN=modtsclust, K.max=kmax, B=50, centroid = "pam", type = "partitional")
# Gap statistic on NGF, RAW data (B = 100 reference data sets)
temp <- copy(NGF)
temp <- dcast(temp, Label ~ RealTime, value.var = "Ratio")
temp <- as.matrix(temp[, -1]) # remove first columns with labels
# BUG FIX: "Ratio" is the RAW measure (values roughly 1000-1750); the previous
# fill value of 1.1 was the one meant for normalized data and would have turned
# every missing time point into an extreme outlier. Use the raw fill constant
# (na.f.raw = 1200) as in the rest of the analysis.
temp[which(is.na(temp))] <- 1200
cgap_NGF <- clusGap(temp, FUN=modtsclust, K.max=kmax, B=100, centroid = "pam", type = "partitional")
# Gap statistic on FGF, normalized + smoothed data (B = 100 reference data sets)
temp <- copy(FGF)
temp <- dcast(temp, Label ~ RealTime, value.var = "Ratio.norm.smooth")
temp <- as.matrix(temp[, -1]) # remove first columns with labels
# 1.1 is the NA fill for normalized data (see na.f.nor above)
temp[which(is.na(temp))] <- 1.1
cgap_FGF_smooth <- clusGap(temp, FUN=modtsclust, K.max=kmax, B=100, centroid = "pam", type = "partitional")
# Pick the smallest k whose gap exceeds gap(k+1) - SE(k+1)
# (Tibshirani's "firstSEmax" rule; Tab columns: logW, E.logW, gap, SE.sim).
# BUG FIX: the original `break` was placed AFTER the if-block, so the loop
# unconditionally stopped after the first iteration and could only ever report
# k = 1 (or nothing). The break now fires only once the criterion is met.
for(k in 1:(kmax-1)) {
  if(cgap_FGF_smooth$Tab[k,3] > cgap_FGF_smooth$Tab[(k+1),3] - cgap_FGF_smooth$Tab[(k+1),4]) {
    print(k)
    break
  }
}
# Load results
library(cluster)
temp <- "C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/"
temp <- paste0(temp, dir(temp))
sapply(temp, load, .GlobalEnv)
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_EGF_norm.Robj
## "cgap_EGF_norm"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_EGF_raw.Robj
## "cgap_EGF_raw"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_EGF_smooth.Robj
## "cgap_EGF_smooth"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_FGF_norm.Robj
## "cgap_FGF_norm"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_FGF_raw.Robj
## "cgap_FGF_raw"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_FGF_smooth.Robj
## "cgap_FGF_smooth"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_NGF_norm.Robj
## "cgap_NGF_norm"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_NGF_raw.Robj
## "cgap_NGF_raw"
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_NGF_smooth.Robj
## "cgap_NGF_smooth"
EGF_short <- EGF[RealTime >= 25 & RealTime <= 100]
FGF_short <- FGF[RealTime >= 25 & RealTime <= 100]
NGF_short <- NGF[RealTime >= 25 & RealTime <= 100]
## CH D SF Sil COP DB DBstar
## raw 2 8 2 2 8 2 2
## norm 2 6 2 2 8 2 2
## smooth 2 7 2 2 6 2 2
## Warning in melt.data.table(as.data.table(clust_EGF_1$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_EGF_2$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_EGF_3$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning: Removed 6 rows containing non-finite values (stat_summary).
Does a much better job at separating the concentrations. Advantages of normalized data become clear.
## CH D SF Sil COP DB DBstar
## raw 2 6 2 2 8 2 2
## norm 2 8 2 2 8 2 2
## smooth 2 8 2 2 8 2 2
## Warning in melt.data.table(as.data.table(clust_NGF_1$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_NGF_2$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_NGF_3$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning: Removed 33 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing missing values (geom_path).
Still impossible to separate the concentrations properly.
## CH D SF Sil COP DB DBstar
## raw 2 8 2 2 8 2 2
## norm 2 3 2 2 8 2 2
## smooth 2 8 2 2 8 2 2
## Warning in melt.data.table(as.data.table(clust_FGF_1$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_FGF_2$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_FGF_3$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning: Removed 6 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing missing values (geom_path).
Very clear separation is performed.
cast_EGF_short <- myCast(EGF_short, tim, cond, lab, "Ratio.norm.smooth", na.fill = 1.1)
# Replace the big outlier that is driving the 2nd component
# (only the single largest value of the whole matrix is clamped to the NA fill value)
cast_EGF_short$casted.matrix[which.max(cast_EGF_short$casted.matrix)] <- 1.1
# PCA on centered (not scaled) trajectories
pca_EGF_short <- prcomp(cast_EGF_short$casted.matrix, center = T)
library(ggbiplot)
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(pca_EGF_short, obs.scale = 1, var.scale = 1,
groups = unlist(cast_EGF_short$casted[,1]), ellipse = TRUE,
circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
cast_NGF_short <- myCast(NGF_short, tim, cond, lab, "Ratio.norm.smooth", na.fill = 1.1)
pca_NGF_short <- prcomp(cast_NGF_short$casted.matrix, center = T)
library(ggbiplot)
g <- ggbiplot(pca_NGF_short, obs.scale = 1, var.scale = 1,
groups = unlist(cast_NGF_short$casted[,1]), ellipse = TRUE,
circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
cast_FGF_short <- myCast(FGF_short, tim, cond, lab, "Ratio.norm.smooth", na.fill = 1.1)
pca_FGF_short <- prcomp(cast_FGF_short$casted.matrix, center = T)
library(ggbiplot)
g <- ggbiplot(pca_FGF_short, obs.scale = 1, var.scale = 1,
groups = unlist(cast_FGF_short$casted[,1]), ellipse = TRUE,
circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
legend.position = 'top')
print(g)
The PCA plots show the same separability properties as clustering does. In addition the correlation circle confirms that data gets separated after the peak. Note that the 2nd component is much more important in FGF.
An alternative representation is tsne, which gives a better separation of the classes:
library(Rtsne)
tsne <- Rtsne(X = cast_EGF_short$casted.matrix - mean(cast_EGF_short$casted.matrix), dims = 2, perplexity=30, verbose=FALSE, max_iter = 1000)
colors = rainbow(length(unique(unlist(cast_EGF_short$casted[,1]))))
plot(tsne$Y)
text(tsne$Y, labels=unlist(cast_EGF_short$casted[,1]), col = colors[unlist(cast_EGF_short$casted[,1])])
tsne <- Rtsne(X = cast_NGF_short$casted.matrix - mean(cast_NGF_short$casted.matrix), dims = 2, perplexity=30, verbose=FALSE, max_iter = 1000)
colors = rainbow(length(unique(unlist(cast_NGF_short$casted[,1]))))
plot(tsne$Y)
text(tsne$Y, labels=unlist(cast_NGF_short$casted[,1]), col = colors[unlist(cast_NGF_short$casted[,1])])
tsne <- Rtsne(X = cast_FGF_short$casted.matrix - mean(cast_FGF_short$casted.matrix), dims = 2, perplexity=30, verbose=FALSE, max_iter = 1000)
colors = rainbow(length(unique(unlist(cast_FGF_short$casted[,1]))))
plot(tsne$Y)
text(tsne$Y, labels=unlist(cast_FGF_short$casted[,1]), col = colors[unlist(cast_FGF_short$casted[,1])])
Idea: For a given GF, at each time point and for each concentration get the signal distribution, check if they overlap well between different concentrations. Overlap can be measured by KS statistics, Bhattacharyya index, Kullback–Leibler…
However this implicitly assumes that there’s no subpopulation within a given condition (GF + concentration).
sep.meas.along.time <- function(data1, data2, time.col, measure.col){
  # For each time point, compute the separability measures between the value
  # distributions of data1 and data2 (two long data.tables of one condition each).
  # time.col / measure.col: column names (strings) for time and measurement.
  # Returns one row per time point, with the time vector bound as first column.
  # Requires the two tables to share exactly the same time vector.
  timev <- unique(data1[, get(time.col)])
  if(!(identical(unique(data2[, get(time.col)]), timev))) stop("Time vectors must be identical between the two data")
  # BUG FIX: the original loop hardcoded `RealTime == t`, silently ignoring
  # time.col for every time point after the first -- the function only worked
  # when time.col happened to be "RealTime". Use get(time.col) throughout.
  # Also builds results as a list instead of growing with rbind in a loop.
  sep.list <- lapply(timev, function(t){
    separability.measures(data1[get(time.col) == t, get(measure.col)],
                          data2[get(time.col) == t, get(measure.col)])
  })
  out <- do.call(rbind, sep.list)
  out <- cbind(timev, out)
  return(out)
}
library(plyr)
# Get all pairs of conditions
conditions <- combn(as.character(unique(Yanni[,Condition])), m = 2)
# Keep only the within-growth-factor pairs (column indices picked by hand from
# the combn output -- TODO: select programmatically so this survives reordering)
conditions <- conditions[,c(1:3,12,13,22, 39:41,46,47,52, 61:66)]
# Compute separabilities of conditions at each time point
sep.meas.raw <- apply(conditions, 2, function(x) sep.meas.along.time(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "RealTime", "Ratio" ))
# Name each list element "cond1,cond2" so the pair can be recovered later
names(sep.meas.raw) <- apply(conditions, 2, function(x) paste(x[1], x[2], sep = ","))
# Go to data table
for(i in 1:length(sep.meas.raw)){
temp <- unlist(strsplit(names(sep.meas.raw)[i], ","))
sep.meas.raw[[i]]$Cond1 <- temp[1]
sep.meas.raw[[i]]$Cond2 <- temp[2]
}
sep.meas.raw <- as.data.table(rbind.fill(sep.meas.raw))
sep.meas.raw[, c("Cond1", "Cond2") := list(as.factor(Cond1), as.factor(Cond2))]
ggplot(sep.meas.raw, aes(x= timev, y = jm)) + geom_line() + facet_wrap(~Cond1 + Cond2) + theme(text=element_text(size=25)) + ylab("Jeffries-Matusita [0, sqrt(2)]") + geom_vline(xintercept=40, colour="blue", linetype="longdash")
Can sum up these curves with area under curve:
max.val <- sqrt(2) * 101
auc.raw <- sep.meas.raw[, .(auc = sum(jm, na.rm = T)/max.val), by = c("Cond1", "Cond2")] # a few NAs, slight bias in the values
auc.raw
## Cond1 Cond2 auc
## 1: E-0.25 E-2.5 0.086930074
## 2: E-0.25 E-25 0.132884272
## 3: E-0.25 E-250 0.183366617
## 4: E-2.5 E-25 0.038354825
## 5: E-2.5 E-250 0.060566879
## 6: E-25 E-250 0.023626610
## 7: F-0.25 F-2.5 0.022514830
## 8: F-0.25 F-25 0.089739650
## 9: F-0.25 F-250 0.296925989
## 10: F-2.5 F-25 0.060904139
## 11: F-2.5 F-250 0.255272578
## 12: F-25 F-250 0.093969100
## 13: N-0.25 N-2.5 0.031767259
## 14: N-0.25 N-25 0.039475775
## 15: N-0.25 N-250 0.033815373
## 16: N-2.5 N-25 0.009405366
## 17: N-2.5 N-250 0.008092665
## 18: N-25 N-250 0.014956489
The indicator seems to show the differences in behaviour really well: NGF curves are completely flat, indicating that there are few deviations between 2 concentrations. As for EGF and FGF, about 2 clusters appear, separating behaviours at low versus behaviours at high concentrations.
The peaks observed around 40min can either come from differences in excitation level or from a shift between series; this needs to be checked. If the former is true, we should perform a multiple alignment within each concentration prior to computing the distances.
sep.meas.norm <- apply(conditions, 2, function(x) sep.meas.along.time(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "RealTime", "Ratio.norm" ))
names(sep.meas.norm) <- apply(conditions, 2, function(x) paste(x[1], x[2], sep = ","))
# Go to data table
for(i in 1:length(sep.meas.norm)){
temp <- unlist(strsplit(names(sep.meas.norm)[i], ","))
sep.meas.norm[[i]]$Cond1 <- temp[1]
sep.meas.norm[[i]]$Cond2 <- temp[2]
}
sep.meas.norm <- as.data.table(rbind.fill(sep.meas.norm))
sep.meas.norm[, c("Cond1", "Cond2") := list(as.factor(Cond1), as.factor(Cond2))]
ggplot(sep.meas.norm, aes(x= timev, y = jm)) + geom_line() + facet_wrap(~Cond1 + Cond2) + theme(text=element_text(size=25)) + ylab("Jeffries-Matusita [0, sqrt(2)]") + geom_vline(xintercept=40, colour="blue", linetype="longdash")
auc.norm <- sep.meas.norm[, .(auc = sum(jm, na.rm = T)/max.val), by = c("Cond1", "Cond2")] # a few NAs, slight bias in the values
auc.norm
## Cond1 Cond2 auc
## 1: E-0.25 E-2.5 0.09572728
## 2: E-0.25 E-25 0.13078027
## 3: E-0.25 E-250 0.24245259
## 4: E-2.5 E-25 0.05833530
## 5: E-2.5 E-250 0.09841061
## 6: E-25 E-250 0.05160538
## 7: F-0.25 F-2.5 0.02399241
## 8: F-0.25 F-25 0.14924847
## 9: F-0.25 F-250 0.30922374
## 10: F-2.5 F-25 0.11881887
## 11: F-2.5 F-250 0.29071881
## 12: F-25 F-250 0.08043974
## 13: N-0.25 N-2.5 0.02954189
## 14: N-0.25 N-25 0.03388359
## 15: N-0.25 N-250 0.03452136
## 16: N-2.5 N-25 0.01081681
## 17: N-2.5 N-250 0.01714004
## 18: N-25 N-250 0.01339676
Seems that the normalization does the metric no good, by amplifying small differences (due to shift)?
sep.meas.smooth <- apply(conditions, 2, function(x) sep.meas.along.time(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "RealTime", "Ratio.norm.smooth" ))
names(sep.meas.smooth) <- apply(conditions, 2, function(x) paste(x[1], x[2], sep = ","))
# Go to data table
for(i in 1:length(sep.meas.smooth)){
temp <- unlist(strsplit(names(sep.meas.smooth)[i], ","))
sep.meas.smooth[[i]]$Cond1 <- temp[1]
sep.meas.smooth[[i]]$Cond2 <- temp[2]
}
sep.meas.smooth <- as.data.table(rbind.fill(sep.meas.smooth))
sep.meas.smooth[, c("Cond1", "Cond2") := list(as.factor(Cond1), as.factor(Cond2))]
ggplot(sep.meas.smooth, aes(x= timev, y = jm)) + geom_line() + facet_wrap(~Cond1 + Cond2) + theme(text=element_text(size=25)) + ylab("Jeffries-Matusita [0, sqrt(2)]") + geom_vline(xintercept=40, colour="blue", linetype="longdash")
auc.smooth <- sep.meas.smooth[, .(auc = sum(jm, na.rm = T)/max.val), by = c("Cond1", "Cond2")] # a few NAs, slight bias in the values
auc.smooth
## Cond1 Cond2 auc
## 1: E-0.25 E-2.5 0.10127149
## 2: E-0.25 E-25 0.13575894
## 3: E-0.25 E-250 0.25643512
## 4: E-2.5 E-25 0.05960717
## 5: E-2.5 E-250 0.11217274
## 6: E-25 E-250 0.05840472
## 7: F-0.25 F-2.5 0.06696335
## 8: F-0.25 F-25 0.14808925
## 9: F-0.25 F-250 0.31876715
## 10: F-2.5 F-25 0.08271862
## 11: F-2.5 F-250 0.18465569
## 12: F-25 F-250 0.08339709
## 13: N-0.25 N-2.5 0.04005771
## 14: N-0.25 N-25 0.03978603
## 15: N-0.25 N-250 0.04588730
## 16: N-2.5 N-25 0.01082399
## 17: N-2.5 N-250 0.01855943
## 18: N-25 N-250 0.01409727
Same as for normalization, probably better to stick to raw data if possible.
Tells us if the observed area under the curve is really related to a difference between the two treatments or if it is spurious. If there is n trajectories recorded in condition 1 and m in condition 2, the permutation test is performed like so:
We then look if the observed auc is likely in this empirical distribution.
one.permutation.auc <- function(x, y, metric){
  # One permutation draw: pool the trajectories of x and y, randomly re-split
  # the pool into two groups of the original sizes, then sum the chosen
  # separability metric over all time points (columns). The result is one AUC
  # value under the null hypothesis of no difference between the two groups.
  n.x <- nrow(x)
  pooled <- rbind(x, y)
  idx.x <- sample(seq_len(nrow(pooled)), size = n.x, replace = FALSE)
  x.perm <- pooled[idx.x, ]
  y.perm <- pooled[-idx.x, ]
  per.time <- sapply(seq_len(ncol(x)),
                     function(j) separability.measures(x.perm[, j], y.perm[, j]))
  sum(unlist(per.time[metric, ]))
}
permutation.auc <- function(x, y, n, metric = "jm"){
  # Empirical null distribution of the separability AUC via permutations.
  # x, y: two matrices representing time series (row = trajectory, col = time)
  # n: number of permutations
  # metric: one of "jm", "bh", "div", "tdiv", "ks"
  # Returns a numeric vector of n AUC values computed on shuffled group labels.
  if(ncol(x) != ncol(y)) stop("x and y must have same number of columns")
  replicate(n, one.permutation.auc(x, y, metric))
}
wrap_perm <- function(x, y, meas, n){
  # Convenience wrapper: cast two long data.tables of one condition each to wide
  # matrices (NAs filled with 1100), then run the permutation test on them.
  mat.a <- myCast(x, "RealTime", "Condition", "Label", meas, 1100)$casted.matrix
  mat.b <- myCast(y, "RealTime", "Condition", "Label", meas, 1100)$casted.matrix
  permutation.auc(mat.a, mat.b, n)
}
temp <- apply(conditions, 2, function(x) wrap_perm(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "Ratio", 500))
temp <- temp/max.val
colnames(temp) <- apply(conditions, 2, paste, collapse=" ; ")
par(mfrow=c(6,3))
for(j in 1:ncol(temp)){
hist(temp[,j], main = colnames(temp)[j], ylab="", xlab = "", cex.main = 3, cex.axis = 3)
}
In order to know whether these area under curves are robust estimates, could perform bootstraps the way it is performed in phylogenetics:
Because we deal here with comparison of 2 populations we can modify this to:
This implies that matrices are of same length and defined at the same times.
one.bootstrap.auc.percol <- function(x, y, metric){
  # One bootstrap draw over TIME POINTS (phylogenetics-style): resample columns
  # with replacement -- the SAME columns for both groups -- and sum the chosen
  # separability metric over the resampled columns.
  cols <- sample(seq_len(ncol(x)), size = ncol(x), replace = TRUE)
  x.boot <- x[, cols]
  y.boot <- y[, cols]
  per.time <- sapply(seq_len(ncol(x)),
                     function(j) separability.measures(x.boot[, j], y.boot[, j]))
  sum(unlist(per.time[metric, ]))
}
bootstrap.auc.percol <- function(x, y, B, metric = "jm"){
  # Bootstrap distribution of the separability "AUC", resampling time points.
  #
  # x, y:   matrices of time series (row = trajectory, col = time point)
  # B:      number of bootstrap draws
  # metric: one of "jm", "bh", "div", "tdiv", "ks"
  # Returns a numeric vector of B bootstrapped separability sums.
  if (ncol(x) != ncol(y)) stop("x and y must have same number of columns")
  vapply(seq_len(B), function(b) one.bootstrap.auc.percol(x, y, metric), numeric(1))
}
wrap_bootcol <- function(x, y, meas, n){
  # Convenience wrapper: cast two long-format tables into wide trajectory
  # matrices (via project helper myCast) and run the per-column bootstrap.
  mat1 <- myCast(x, "RealTime", "Condition", "Label", meas, 1100)$casted.matrix
  mat2 <- myCast(y, "RealTime", "Condition", "Label", meas, 1100)$casted.matrix
  bootstrap.auc.percol(mat1, mat2, n)
}
# Per-column (time point) bootstrap distribution (500 draws) for each
# condition pair. Relies on the earlier-defined `conditions` and `max.val`.
bootcol <- apply(conditions, 2, function(x) wrap_bootcol(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "Ratio", 500))
# Scale to the same normalization as the observed AUC values.
bootcol <- bootcol/max.val
colnames(bootcol) <- apply(conditions, 2, paste, collapse=";")
# One histogram per condition pair (6x3 grid).
par(mfrow=c(6,3))
for(j in 1:ncol(bootcol)){
hist(bootcol[,j], main = colnames(bootcol)[j], ylab="", xlab = "", cex.main = 3, cex.axis = 3)
}
We see that the average of the distribution corresponds to the observed value.
A lack of robustness here could potentially reveal the presence of subpopulations that are missed when the trajectories are simply summed up into an AUC.
one.bootstrap.auc.perrow <- function(x, y, metric){
  # One bootstrap draw over trajectories (rows): resample rows with
  # replacement, independently within each group, then sum the separability
  # measure across all time points.
  rows.x <- sample(seq_len(nrow(x)), size = nrow(x), replace = TRUE)
  rows.y <- sample(seq_len(nrow(y)), size = nrow(y), replace = TRUE)
  x.boot <- x[rows.x, ]
  y.boot <- y[rows.y, ]
  per.col <- sapply(seq_len(ncol(x)), function(k) separability.measures(x.boot[, k], y.boot[, k]))
  sum(unlist(per.col[metric, ]))
}
bootstrap.auc.perrow <- function(x, y, B, metric = "jm"){
  # Bootstrap distribution of the separability "AUC", resampling trajectories.
  #
  # x, y:   matrices of time series (row = trajectory, col = time point)
  # B:      number of bootstrap draws
  # metric: one of "jm", "bh", "div", "tdiv", "ks"
  # Returns a numeric vector of B bootstrapped separability sums.
  if (ncol(x) != ncol(y)) stop("x and y must have same number of columns")
  vapply(seq_len(B), function(b) one.bootstrap.auc.perrow(x, y, metric), numeric(1))
}
wrap_bootrow <- function(x, y, meas, n){
  # Convenience wrapper: cast two long-format tables into wide trajectory
  # matrices (via project helper myCast) and run the per-row bootstrap.
  mat1 <- myCast(x, "RealTime", "Condition", "Label", meas, 1100)$casted.matrix
  mat2 <- myCast(y, "RealTime", "Condition", "Label", meas, 1100)$casted.matrix
  bootstrap.auc.perrow(mat1, mat2, n)
}
# Per-row (trajectory) bootstrap distribution (500 draws) for each condition
# pair. Relies on the earlier-defined `conditions` and `max.val`.
bootrow <- apply(conditions, 2, function(x) wrap_bootrow(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "Ratio", 500))
# Scale to the same normalization as the observed AUC values.
bootrow <- bootrow/max.val
colnames(bootrow) <- apply(conditions, 2, paste, collapse=" ; ")
# One histogram per condition pair (6x3 grid).
par(mfrow=c(6,3))
for(j in 1:ncol(bootrow)){
hist(bootrow[,j], main = colnames(bootrow)[j], ylab="", xlab = "", cex.main = 3, cex.axis = 3)
}
# Get CI directly from bootstrap quantiles (percentile method, 95% CI).
ci.bootcol <- apply(bootcol, 2, quantile, probs = c(0.025, 0.975))
ci.bootrow <- apply(bootrow, 2, quantile, probs = c(0.025, 0.975))
# Plotting variables.
# NOTE(review): `auc.raw` (observed AUC per condition pair, with columns
# Cond1, Cond2, auc) is created earlier in the file — rows are assumed to be
# in the same order as the columns of `conditions`; confirm before running.
auc.raw$ci.low.col <- ci.bootcol[1,]
auc.raw$ci.high.col <- ci.bootcol[2,]
auc.raw$ci.low.row <- ci.bootrow[1,]
auc.raw$ci.high.row <- ci.bootrow[2,]
# Combined label "Cond1;Cond2" for the x axis of the CI plots.
auc.raw[, comb.cond := as.factor(paste(Cond1, Cond2, sep = ";"))]
# Growth factor deduced from the first letter of the condition (E/F/N -> EGF/FGF/NGF).
auc.raw[, GF := as.factor(paste0(str_sub(Cond1,1,1), "GF"))]
# Heatmap of observed AUC per condition pair, one facet per growth factor.
ggplot(auc.raw, aes(x=Cond1, y=Cond2)) +
geom_raster(aes(fill=auc)) +
facet_wrap(~GF, scales = "free")
# Observed AUC with per-column (time point) bootstrap CIs.
ggplot(auc.raw, aes(x=comb.cond, y=auc, group = 1)) +
geom_errorbar(aes(ymin=ci.low.col, ymax=ci.high.col)) +
geom_point(size = 5, shape=21, fill = "white" ) +
facet_grid(.~GF, scales = "free") +
ggtitle("CI based on per column bootstraps") +
theme(strip.text.x = element_text(size=30), axis.text = element_text(size=13), title = element_text(size=30))
# Observed AUC with per-row (trajectory) bootstrap CIs.
ggplot(auc.raw, aes(x=comb.cond, y=auc, group = 1)) +
geom_errorbar(aes(ymin=ci.low.row, ymax=ci.high.row)) +
geom_point(size = 5, shape=21, fill = "white" ) +
facet_grid(.~GF, scales = "free") +
ggtitle("CI based on per row bootstraps") +
theme(strip.text.x = element_text(size=30), axis.text = element_text(size=13), title = element_text(size=30))
# Reshape to "diagonal": pivot the pairwise AUCs into a wide Cond1 x Cond2
# table (upper-triangular, NA where the pair was not compared).
dist.raw <- dcast(data=auc.raw[,1:3], formula = Cond1 ~ Cond2, value.var = "auc")
# Keep the row condition names before dropping the Cond1 column.
row_nams <- c(dist.raw[,1])$Cond1
dist.raw <- dist.raw[,-1]
dist.raw
## E-2.5 E-25 E-250 F-2.5 F-25 F-250
## 1: 0.08693007 0.13288427 0.18336662 NA NA NA
## 2: NA 0.03835482 0.06056688 NA NA NA
## 3: NA NA 0.02362661 NA NA NA
## 4: NA NA NA 0.02251483 0.08973965 0.2969260
## 5: NA NA NA NA 0.06090414 0.2552726
## 6: NA NA NA NA NA 0.0939691
## 7: NA NA NA NA NA NA
## 8: NA NA NA NA NA NA
## 9: NA NA NA NA NA NA
## N-2.5 N-25 N-250
## 1: NA NA NA
## 2: NA NA NA
## 3: NA NA NA
## 4: NA NA NA
## 5: NA NA NA
## 6: NA NA NA
## 7: 0.03176726 0.039475775 0.033815373
## 8: NA 0.009405366 0.008092665
## 9: NA NA 0.014956489
# Isolate EGF, go to square symmetric matrix
temp <- as.matrix(dist.raw[1:3,1:3])
rownames(temp) <- row_nams[1:3]
# Union of row and column conditions -> all 4 EGF concentrations.
indices <- unique(c(rownames(temp), colnames(temp)))
mat.EGF <- matrix(0, nrow=length(indices), ncol=length(indices),dimnames = list(indices,indices))
# temp's rows/columns are offset by one concentration, so its 6 filled cells
# (upper triangle INCLUDING the diagonal) correspond, in the same column-major
# order, to the 6 cells of the STRICT upper triangle of the 4x4 matrix.
mat.EGF[upper.tri(mat.EGF, diag = F)] <- temp[upper.tri(temp, diag=T)]
# Mirror to a full symmetric distance matrix (diagonal stays 0).
mat.EGF <- mat.EGF + t(mat.EGF)
# Hierarchical clustering
plot(hclust(as.dist(mat.EGF)))
# Isolate FGF, go to square symmetric matrix
temp <- as.matrix(dist.raw[4:6,4:6])
rownames(temp) <- row_nams[4:6]
# Union of row and column conditions -> all 4 FGF concentrations.
indices <- unique(c(rownames(temp), colnames(temp)))
mat.FGF <- matrix(0, nrow=length(indices), ncol=length(indices),dimnames = list(indices,indices))
# Same offset trick as for EGF: temp's upper triangle including the diagonal
# maps onto the strict upper triangle of the 4x4 matrix.
mat.FGF[upper.tri(mat.FGF, diag = F)] <- temp[upper.tri(temp, diag=T)]
# Mirror to a full symmetric distance matrix (diagonal stays 0).
mat.FGF <- mat.FGF + t(mat.FGF)
# Hierarchical clustering
plot(hclust(as.dist(mat.FGF)))
# Isolate NGF, go to square symmetric matrix (comment previously said FGF)
temp <- as.matrix(dist.raw[7:9,7:9])
rownames(temp) <- row_nams[7:9]
# Union of row and column conditions -> all 4 NGF concentrations.
indices <- unique(c(rownames(temp), colnames(temp)))
mat.NGF <- matrix(0, nrow=length(indices), ncol=length(indices),dimnames = list(indices,indices))
# Same offset trick as for EGF: temp's upper triangle including the diagonal
# maps onto the strict upper triangle of the 4x4 matrix.
mat.NGF[upper.tri(mat.NGF, diag = F)] <- temp[upper.tri(temp, diag=T)]
# Mirror to a full symmetric distance matrix (diagonal stays 0).
mat.NGF <- mat.NGF + t(mat.NGF)
# Hierarchical clustering
plot(hclust(as.dist(mat.NGF)))
Idea: For each GF, compare each pair of trajectories that do not belong to the same concentration. This comparison can be done by cross-correlation/overlap of clipped trajectories, simple correlations… Cross-correlation provides a way to align the signals as best as we can by shifting; this is useful to get rid of differences that would come from experimental conditions, e.g. we’ve been waiting 5 more minutes to inject the GF.
Could also check coherence (cross-correlation based on spectral densities).
Idea: Quantize the signals and check how complex they are. Pool all concentrations of a single GF together and see how complex the resulting image is. We can quantize the signals and compute the entropy, or do it directly from a heatmap (quantization is then performed by color coding).
We are interested in quantifying the noise of the response for one GF in one concentration. We can use the metrics used for assessing synchrony of signals.
# Must add a row for each missing value to the long data table, so every
# (Condition, Label) pair is defined on the same time grid.
# 1) Perform a left outer join with a table that contains all combinations of
#    Condition x Label x RealTime (0 to 200 in steps of 2).
temp <- CJ(Condition=unique(Yanni$Condition), Label=unique(Yanni$Label), RealTime=seq(0,200,2))
temp <- merge(temp, Yanni, by = c("Condition", "Label", "RealTime"), all.x = T)
# 2) Trim the groups that contain only NA (i.e. this combination of Label and
#    Condition does not exist in the real data, example with Label == "21_0010").
Yanni.with.na <- temp[, if(!all(is.na(Ratio))) .SD, by = .(Condition, Label)]
rm(temp)
# 3) Replace remaining NA with a predefined value (0), in place, column by column.
for (i in seq_along(Yanni.with.na)) set(Yanni.with.na, i=which(is.na(Yanni.with.na[[i]])), j=i, value=0)
# Euclidean distance of each trajectory to the mean trajectory of its
# condition (project helper dist_mean), for the raw and for the
# normalized/smoothed ratio. `return.mean = FALSE` keeps only the distances
# (spelled out instead of the reassignable `F`).
Yanni_mean.raw <- dist_mean(data = Yanni.with.na, condition = "Condition", label = "Label", measure = "Ratio", tcol = "RealTime", return.mean = FALSE)
Yanni_mean.smooth <- dist_mean(data = Yanni.with.na, condition = "Condition", label = "Label", measure = "Ratio.norm.smooth", tcol = "RealTime", return.mean = FALSE)
# Boxplots of the distances per condition (typo "noramlized" fixed in title).
ggplot(Yanni_mean.raw, aes(x=Condition, y=euclid_to_mean)) + geom_boxplot() + ggtitle("Distance to mean trajectory with raw ratio")
ggplot(Yanni_mean.smooth, aes(x=Condition, y=euclid_to_mean)) + geom_boxplot() + ggtitle("Distance to mean trajectory with normalized smoothed ratio")
# Pairwise synchrony statistics (project helper all_pairwise_stats) within
# each condition, for raw and normalized/smoothed ratio.
Yanni_pw.raw <- all_pairwise_stats(data = Yanni.with.na, condition = "Condition", label = "Label", measure = "Ratio", k_roll_mean = 5)
Yanni_pw.smooth <- all_pairwise_stats(data = Yanni.with.na, condition = "Condition", label = "Label", measure = "Ratio.norm.smooth", k_roll_mean = 5)
# Long format: one row per (condition, pair, statistic) for faceted boxplots.
temp <- melt(Yanni_pw.raw, id.vars = c("Condition", "Label1", "Label2"))
ggplot(temp, aes(x=Condition, y = value)) +
geom_boxplot() +
facet_wrap("variable") +
ggtitle("Pairwise correlations and overlap of clipped trajectories using raw ratio") +
theme(text = element_text(size= 20))
# Same plot for the normalized, smoothed ratio.
temp <- melt(Yanni_pw.smooth, id.vars = c("Condition", "Label1", "Label2"))
ggplot(temp, aes(x=Condition, y = value)) +
geom_boxplot() +
facet_wrap("variable") +
ggtitle("Pairwise correlations and overlap of clipped trajectories using normalized, smoothed ratio") +
theme(text = element_text(size= 20))
Overlap of clipped trajectories was designed for oscillating data, so it does not make sense to use it here.
pairwise_dtw <- function(data, condition, label, measure){
  # Pairwise dynamic time warping (DTW) distance between all trajectory pairs
  # belonging to the same condition.
  #
  # data:      long-format data.table, one row per (condition, label, time point)
  # condition: name (string) of the condition column
  # label:     name (string) of the trajectory identifier column
  # measure:   name (string) of the measurement column passed to dtw()
  #
  # Returns a data.table with one row per unordered pair of trajectories:
  # <condition>, Label1, Label2, DTW. Label1/Label2 and the condition column
  # keep the input columns' types automatically, because they are built from
  # the actual value vectors (the original hand-coerced each type).
  require(data.table)
  # Keyed subsetting data[.(cond, lab)] below depends on this key.
  # NOTE: setkeyv modifies `data` in place (same as the original).
  setkeyv(data, c(condition, label))
  per.condition <- list()
  for (cond in unique(data[, get(condition)])) {
    labels <- unique(data[.(cond), get(label)])
    # A condition with fewer than 2 trajectories has no pairs; the original
    # 1:(length(labels)-1) loop would have run backwards and failed here.
    if (length(labels) < 2L) next
    # All unordered index pairs (2 x n_pairs matrix).
    pair.idx <- utils::combn(seq_along(labels), 2L)
    distances <- vapply(seq_len(ncol(pair.idx)), function(p) {
      dtw(data[.(cond, labels[pair.idx[1L, p]]), get(measure)],
          data[.(cond, labels[pair.idx[2L, p]]), get(measure)])$distance
    }, numeric(1L))
    dt <- data.table(cond,
                     Label1 = labels[pair.idx[1L, ]],
                     Label2 = labels[pair.idx[2L, ]],
                     DTW = distances)
    setnames(dt, "cond", condition)
    per.condition[[length(per.condition) + 1L]] <- dt
  }
  if (length(per.condition) == 0L) {
    # No condition had >= 2 trajectories: return an empty, correctly-typed table.
    out <- data.table(data[0L, get(condition)], data[0L, get(label)],
                      data[0L, get(label)], numeric(0))
    setnames(out, c(condition, "Label1", "Label2", "DTW"))
    return(out)
  }
  rbindlist(per.condition)
}
# Pairwise DTW distances within each condition, raw and normalized/smoothed.
Yanni_dtw.raw <- pairwise_dtw(data = Yanni.with.na, condition = "Condition", label = "Label", measure = "Ratio")
Yanni_dtw.smooth <- pairwise_dtw(data = Yanni.with.na, condition = "Condition", label = "Label", measure = "Ratio.norm.smooth")
# Log scale used for the raw-ratio distances; linear for the smoothed ones.
ggplot(Yanni_dtw.raw, aes(x=Condition, y=log(DTW))) + geom_boxplot() + ggtitle("Pairwise DTW, Euclidean, raw ratio")
ggplot(Yanni_dtw.smooth, aes(x=Condition, y=DTW)) + geom_boxplot() + ggtitle("Pairwise DTW, Euclidean, normalized smooth ratio")